Packages

Load packages

library(dplyr)
library(ggplot2)
library(leaflet) 
library(sf) # handle simple features objects
library(spdep) # areal analysis
library(proj4)
library(maptools)
library(geoR)
library(spgwr)
library(nlme)
library(pgirmess)

Areal Data

Data downloaded from https://github.com/coreysparks/data

#mort<-st_read("/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Data/usdata_mort.shp")
#mort_sp <- sf:::as_Spatial(mort$geom)
# don't know projection so use readShapePoly
mort_sp <-readShapePoly("/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Data/usdata_mort.shp")

library(rgeos) # use this to get centroid coordinates of the counties
mort_coords<-data.frame(gCentroid(mort_sp,byid=TRUE))
#mort.pal = colorNumeric(c('blue','purple','pink','yellow'),
#                      domain=mort$mortrate)
#mort %>% 
#  leaflet() %>% 
#  addProviderTiles('CartoDB.Positron') %>% 
#  addPolygons(fillColor=~mort.pal(mortrate), weight=.5, fillOpacity=.9,
#              label=~paste0(NAME, ': ', poptot)) %>%
#  addLegend("bottomleft", pal = mort.pal, values = ~mortrate, title = "County Mortality Rates", opacity = 1)

spplot(mort_sp,"mortrate", at=quantile(mort_sp$mortrate), col.regions=heat.colors(10), main="Spatial Distribution of US Mortality Rate")

Defining neighbours, weights

Let’s use 2-NN as our starting place to look at the global Moran’s I for Mortality Rates (use lag plot)

mort_nb2<-knearneigh(coordinates(mort_sp), k=2)
mort_nb2<-knn2nb(mort_nb2)
mort_wt2<-nb2listw(mort_nb2, style="W")

mort_corrneigh<-sp.correlogram(mort_nb2, mort_sp$mortrate, order=10, method="I",zero.policy = TRUE)
plot(mort_corrneigh, main="Moran's I mortality rate")

# this takes a while to run
mort_correlog<-correlog((mort_coords),mort_sp$mortrate,method="Moran",nbclass=10)
plot(mort_correlog)

Linear model (OLS)

Look at a linear model and the spatial distribution of the residuals

mod_lm<-lm(scale(mortrate)~ppersonspo+p65plus+pwhite+pblack_1+phisp+punemp_1, data=mort_sp)
summary(mod_lm)
## 
## Call:
## lm(formula = scale(mortrate) ~ ppersonspo + p65plus + pwhite + 
##     pblack_1 + phisp + punemp_1, data = mort_sp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5903 -0.4477  0.0251  0.4782  4.1662 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.3411     0.2122  -6.321 2.97e-10 ***
## ppersonspo    7.8135     0.3237  24.136  < 2e-16 ***
## p65plus      -2.1017     0.3679  -5.713 1.22e-08 ***
## pwhite        0.4330     0.2103   2.059   0.0396 *  
## pblack_1      1.2972     0.2081   6.232 5.23e-10 ***
## phisp        -2.1099     0.1405 -15.021  < 2e-16 ***
## punemp_1      3.4119     0.7665   4.451 8.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7626 on 3060 degrees of freedom
## Multiple R-squared:  0.4196, Adjusted R-squared:  0.4185 
## F-statistic: 368.7 on 6 and 3060 DF,  p-value: < 2.2e-16
# map residuals

mort_sp$lm_resid<-mod_lm$residuals
spplot(mort_sp,"lm_resid", at=quantile(mort_sp$lm_resid), col.regions=heat.colors(10), main="Resids from OLS")

moran.test(mort_sp$lm_resid, listw=mort_wt2)
## 
##  Moran I test under randomisation
## 
## data:  mort_sp$lm_resid  
## weights: mort_wt2    
## 
## Moran I statistic standard deviate = 25.491, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.4273265616     -0.0003261579      0.0002814597

SAR Error Model

  • This applies spatial weights to both y and x
  • Errors are the residuals and spatial weights \(\rho\) are applied to these \(\rho(X\beta -Y)\).
  • R output calls spatial weights \(\lambda\)
mod_sar1 <- errorsarlm(scale(mortrate)~ppersonspo+p65plus+pwhite+pblack_1+phisp+punemp_1, data = mort_sp, listw=mort_wt2, zero.policy=T, tol.solve=1e-12)
summary(mod_sar1)
## 
## Call:
## spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.6838336 -0.3671099  0.0083942  0.3956529  4.1889756 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error  z value  Pr(>|z|)
## (Intercept)  0.038914   0.230723   0.1687    0.8661
## ppersonspo   4.969503   0.347972  14.2813 < 2.2e-16
## p65plus      0.400593   0.388857   1.0302    0.3029
## pwhite      -1.163995   0.227124  -5.1249 2.976e-07
## pblack_1     0.098609   0.233750   0.4219    0.6731
## phisp       -1.966691   0.178364 -11.0263 < 2.2e-16
## punemp_1     5.108868   0.750308   6.8090 9.826e-12
## 
## Lambda: 0.48691, LR test value: 717.3, p-value: < 2.22e-16
## Asymptotic standard error: 0.015327
##     z-value: 31.769, p-value: < 2.22e-16
## Wald statistic: 1009.3, p-value: < 2.22e-16
## 
## Log likelihood: -3158.435 for error model
## ML residual variance (sigma squared): 0.41747, (sigma: 0.64612)
## Number of observations: 3067 
## Number of parameters estimated: 9 
## AIC: 6334.9, (AIC for lm: 7050.2)
mort_sp$res_sar1<-residuals(mod_sar1)

spplot(mort_sp,"res_sar1", at=quantile(mort_sp$res_sar1), col.regions=heat.colors(10), main="Resids from SAR error model")

moran.test(mort_sp$res_sar1, listw=mort_wt2)
## 
##  Moran I test under randomisation
## 
## data:  mort_sp$res_sar1  
## weights: mort_wt2    
## 
## Moran I statistic standard deviate = -4.5154, p-value = 1
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0760691917     -0.0003261579      0.0002813806
# MCMC Moran's test giving us 1000 Moran's I statistics (better when # polygons small)
moran_mc_res1<- moran.mc(mort_sp$res_sar1,mort_wt2, zero.policy=T,nsim=999) 
dist999<- hist(moran_mc_res1$res,freq=TRUE,col="light blue",main="Permutation Test for Moran's I - 999 permutations",breaks=75)
lines(moran_mc_res1$statistic,max(dist999$counts),type="h",col="red",lwd=2)

SAR Lag model

  • SAR “lag”" model different function - only applies spatial weights to y
mod_sar2 = lagsarlm(scale(mortrate)~ppersonspo+p65plus+pwhite+pblack_1+phisp+punemp_1, data = mort_sp, mort_wt2, zero.policy=T, tol.solve=1.0e-12)
summary(mod_sar2)
## 
## Call:spatialreg::lagsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, type = type, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3.466024 -0.373268  0.021276  0.392352  4.117610 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##             Estimate Std. Error  z value  Pr(>|z|)
## (Intercept) -0.34735    0.18235  -1.9049   0.05680
## ppersonspo   4.71672    0.28982  16.2748 < 2.2e-16
## p65plus     -0.63477    0.31494  -2.0155   0.04385
## pwhite      -0.42313    0.17993  -2.3516   0.01869
## pblack_1     0.17723    0.17956   0.9870   0.32363
## phisp       -1.52923    0.12257 -12.4769 < 2.2e-16
## punemp_1     3.68583    0.65620   5.6169 1.944e-08
## 
## Rho: 0.4247, LR test value: 737, p-value: < 2.22e-16
## Asymptotic standard error: 0.014702
##     z-value: 28.888, p-value: < 2.22e-16
## Wald statistic: 834.51, p-value: < 2.22e-16
## 
## Log likelihood: -3148.583 for lag model
## ML residual variance (sigma squared): 0.42507, (sigma: 0.65198)
## Number of observations: 3067 
## Number of parameters estimated: 9 
## AIC: 6315.2, (AIC for lm: 7050.2)
## LM test for residual autocorrelation
## test value: 21.042, p-value: 4.4935e-06
mort_sp$res_sar2<-residuals(mod_sar2)
spplot(mort_sp,"res_sar2", at=quantile(mort_sp$res_sar2), col.regions=heat.colors(10), main="Resids from SAR lag model")

moran.test(mort_sp$res_sar2, listw=mort_wt2)
## 
##  Moran I test under randomisation
## 
## data:  mort_sp$res_sar2  
## weights: mort_wt2    
## 
## Moran I statistic standard deviate = -2.2222, p-value = 0.9869
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0376033285     -0.0003261579      0.0002813869

CAR model

  • Conditional autoregressive models require binary weight matrices
  • These models are often better at detecting localized spatial patterns
mort_nb2b<-knearneigh(coordinates(mort_sp), k=2)
mort_nb2b<-knn2nb(mort_nb2b)
mort_wt2b<-nb2listw(mort_nb2b, style="B")

mod_car <- spautolm(scale(mortrate)~ppersonspo+p65plus+pwhite+pblack_1+phisp+punemp_1,data=mort_sp, listw=mort_wt2b, zero.policy=T, tol.solve=1e-12,family="CAR")
summary(mod_car)
## 
## Call: spatialreg::spautolm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, family = family, method = method, 
##     verbose = verbose, trs = trs, interval = interval, zero.policy = zero.policy, 
##     tol.solve = tol.solve, llprof = llprof, control = control)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -3.685178 -0.363960  0.010699  0.369970  4.095396 
## 
## Coefficients: 
##             Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  0.72672    0.23611  3.0778  0.002085
## ppersonspo   3.72883    0.35818 10.4104 < 2.2e-16
## p65plus      1.62690    0.39093  4.1616 3.160e-05
## pwhite      -1.94055    0.22970 -8.4481 < 2.2e-16
## pblack_1    -0.72090    0.24546 -2.9370  0.003314
## phisp       -1.75413    0.20580 -8.5235 < 2.2e-16
## punemp_1     5.78943    0.73706  7.8547 3.997e-15
## 
## Lambda: 0.46282 LR test value: 1001.7 p-value: < 2.22e-16 
## Numerical Hessian standard error of lambda: 0.0052053 
## 
## Log likelihood: -3016.233 
## ML residual variance (sigma squared): 0.32392, (sigma: 0.56914)
## Number of observations: 3067 
## Number of parameters estimated: 9 
## AIC: 6050.5
mort_sp$res_car<-residuals(mod_car)
spplot(mort_sp,"res_car", at=quantile(mort_sp$res_car), col.regions=heat.colors(10), main="Resids from CAR model")

res_car <- residuals(mod_car)

# Moran's test
moran_resc<-moran.test(res_car, listw=mort_wt2b)

Mixed Effects

  • Add random effects for the spatial unit (county).
  • Introduce a spatial term in the coordinates, here the centroid of the counties. This is similar to geostatistics.
  • Here we use SIDS data to run mixed effects model because it is computationally intensive using 3,000+ counties in the mortality data.
  • we will look at the association between proportion of sids cases and proportion of non-white births
nc <- readShapeSpatial(system.file("shapes/sids.shp", package = "spData")[1])

nc$lat<-coordinates(nc)[,2]
nc$lon<-coordinates(nc)[,1]

sids_rate = (nc$SID79)/nc$BIR79
nwbirth_rate = (nc$NWBIR79)/nc$BIR79

spatial<-corSpatial(1,form=~lon+lat,type="gaussian") # like choosing a gaussian covariance function
scor<-Initialize(spatial, as(nc, "data.frame")[,c("lon","lat")], nugget=FALSE)

mod_mixed<-lme(sids_rate ~ nwbirth_rate, random= ~1|CNTY_ID, data=as(nc,"data.frame"), correlation=scor, method="ML", control=lmeControl(singular.ok=TRUE, returnObject = TRUE))
summary(mod_mixed)
## Linear mixed-effects model fit by maximum likelihood
##  Data: as(nc, "data.frame") 
##         AIC       BIC   logLik
##   -1053.397 -1040.371 531.6986
## 
## Random effects:
##  Formula: ~1 | CNTY_ID
##          (Intercept)    Residual
## StdDev: 9.280348e-05 0.001183845
## 
## Correlation Structure: Gaussian spatial correlation
##  Formula: ~lon + lat | CNTY_ID 
##  Parameter estimate(s):
##      range 
## 0.02882469 
## Fixed effects: sids_rate ~ nwbirth_rate 
##                    Value    Std.Error DF  t-value p-value
## (Intercept)  0.001727026 0.0002155968 98 8.010445  0.0000
## nwbirth_rate 0.001004548 0.0005770555 98 1.740817  0.0849
##  Correlation: 
##              (Intr)
## nwbirth_rate -0.831
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -1.93110059 -0.61538259 -0.01740905  0.43223963  3.22018603 
## 
## Number of Observations: 100
## Number of Groups: 100
res_mixed<-residuals(mod_mixed, type="response")

nc$res_mixed<-res_mixed
spplot(nc,"res_mixed", at=quantile(nc$res_mixed), col.regions=heat.colors(10), main="Resids from Mixed Effects model")

# Moran's I of residuals
sids_nb <- poly2nb(nc, queen=T)
sids_mat_w <- nb2listw(sids_nb, style="B", zero.policy=TRUE)
moran.test(nc$res_mixed, listw=sids_mat_w)
## 
##  Moran I test under randomisation
## 
## data:  nc$res_mixed  
## weights: sids_mat_w    
## 
## Moran I statistic standard deviate = 1.8638, p-value = 0.03117
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.104349163      -0.010101010       0.003770746
# compare SIDS mixed effects with SAR model
sids_nb <- poly2nb(nc, queen=T)
sids_mat_w <- nb2listw(sids_nb, style="W", zero.policy=TRUE)
mod_sar_nc <- errorsarlm(sids_rate ~ nwbirth_rate, data = nc, listw=sids_mat_w, zero.policy=T, tol.solve=1e-12)
summary(mod_sar_nc)
## 
## Call:
## spatialreg::errorsarlm(formula = formula, data = data, listw = listw, 
##     na.action = na.action, Durbin = Durbin, etype = etype, method = method, 
##     quiet = quiet, zero.policy = zero.policy, interval = interval, 
##     tol.solve = tol.solve, trs = trs, control = control)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -2.2321e-03 -7.1518e-04 -1.9593e-05  5.0607e-04  3.9720e-03 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  0.00169814 0.00026671  6.3670 1.927e-10
## nwbirth_rate 0.00108402 0.00069006  1.5709    0.1162
## 
## Lambda: 0.27857, LR test value: 3.8702, p-value: 0.049149
## Asymptotic standard error: 0.13102
##     z-value: 2.1261, p-value: 0.033491
## Wald statistic: 4.5205, p-value: 0.033491
## 
## Log likelihood: 533.6337 for error model
## ML residual variance (sigma squared): 1.3327e-06, (sigma: 0.0011544)
## Number of observations: 100 
## Number of parameters estimated: 4 
## AIC: -1059.3, (AIC for lm: -1057.4)
moran.test(residuals(mod_sar_nc),sids_mat_w)
## 
##  Moran I test under randomisation
## 
## data:  residuals(mod_sar_nc)  
## weights: sids_mat_w    
## 
## Moran I statistic standard deviate = 0.23452, p-value = 0.4073
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.005034967      -0.010101010       0.004165434
#compare to lm
sids_lm<-lm(sids_rate ~ nwbirth_rate,data=nc)
summary(sids_lm)
## 
## Call:
## lm(formula = sids_rate ~ nwbirth_rate, data = nc)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0023002 -0.0007330 -0.0000207  0.0005148  0.0038356 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0017270  0.0002156   8.010 2.41e-12 ***
## nwbirth_rate 0.0010045  0.0005771   1.741   0.0849 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0012 on 98 degrees of freedom
## Multiple R-squared:   0.03,  Adjusted R-squared:  0.0201 
## F-statistic:  3.03 on 1 and 98 DF,  p-value: 0.08485
moran.test(sids_lm$residuals, listw=sids_mat_w)
## 
##  Moran I test under randomisation
## 
## data:  sids_lm$residuals  
## weights: sids_mat_w    
## 
## Moran I statistic standard deviate = 2.2233, p-value = 0.0131
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##        0.13367035       -0.01010101        0.00418154

GWR

  • Let us again use the SIDS data because this can take a long time to run
# Freeman-Tukey transformation of the response to make normal distribution
sids_ft = sqrt(1000)*(sqrt(nc$SID79/nc$BIR79) + sqrt((nc$SID79+1)/nc$BIR79))
hist(sids_ft)

# the distribution of the transformed values is more symmetric and there is only one outlier.

nwbirth_rate = (nc$NWBIR79)/nc$BIR79
hist(nwbirth_rate)

# Freeman-Tukey transformation of the covariate to make normal distribution
nwbirth_ft = sqrt(1000)*(sqrt(nc$NWBIR79/nc$BIR79) + sqrt((nc$NWBIR79+1)/nc$BIR79))
hist(nwbirth_ft)

# GWR takes two steps, 1) estimate bandwidth
sids_bwG<-gwr.sel(sids_ft~nwbirth_ft, data=nc, gweight=gwr.Gauss, verbose=T, adapt=T)
## Adaptive q: 0.381966 CV score: 74.61564 
## Adaptive q: 0.618034 CV score: 74.3972 
## Adaptive q: 0.763932 CV score: 74.37443 
## Adaptive q: 0.7297418 CV score: 74.37428 
## Adaptive q: 0.7453343 CV score: 74.37193 
## Adaptive q: 0.7465778 CV score: 74.3722 
## Adaptive q: 0.7409741 CV score: 74.37096 
## Adaptive q: 0.7366838 CV score: 74.37196 
## Adaptive q: 0.7410372 CV score: 74.37097 
## Adaptive q: 0.7393354 CV score: 74.37098 
## Adaptive q: 0.7402008 CV score: 74.37078 
## Adaptive q: 0.7398703 CV score: 74.37078 
## Adaptive q: 0.7400466 CV score: 74.37075 
## Adaptive q: 0.7400059 CV score: 74.37074 
## Adaptive q: 0.7399541 CV score: 74.37075 
## Adaptive q: 0.7400059 CV score: 74.37074
# 2) fit model
mod_gwr <- gwr(sids_ft ~ nwbirth_ft, data = nc, bandwidth=sids_bwG, gweight=gwr.Gauss, hatmatrix = TRUE)
# if you have a pre-defined bandwidth, you only need to do step 2.
mod_gwr
## Call:
## gwr(formula = sids_ft ~ nwbirth_ft, data = nc, bandwidth = sids_bwG, 
##     gweight = gwr.Gauss, hatmatrix = TRUE)
## Kernel function: gwr.Gauss 
## Fixed bandwidth: 0.7400059 
## Summary of GWR coefficient estimates at data points:
##                     Min.     1st Qu.      Median     3rd Qu.        Max.
## X.Intercept.  1.18515404  2.00169153  2.32139225  2.49758078  2.78788348
## nwbirth_ft   -0.00057442  0.01401919  0.01906047  0.02536294  0.04821502
##              Global
## X.Intercept. 2.5847
## nwbirth_ft   0.0099
## Number of data points: 100 
## Effective number of parameters (residual: 2traceS - traceS'S): 16.08104 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 83.91896 
## Sigma (residual: 2traceS - traceS'S): 0.8534923 
## Effective number of parameters (model: traceS): 11.96186 
## Effective degrees of freedom (model: traceS): 88.03814 
## Sigma (model: traceS): 0.8332863 
## Sigma (ML): 0.7818612 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 264.7026 
## AIC (GWR p. 96, eq. 4.22): 246.534 
## Residual sum of squares: 61.13069 
## Quasi-global R2: 0.1612005
# Localized Coefficients from GWR

coef <- mod_gwr$SDF$nwbirth_ft
nc$gwr_coef<-coef

spplot(nc,"gwr_coef", at=quantile(nc$gwr_coef), col.regions=heat.colors(10), main="GWR local beta coefficient estimates (non-white births)")

# plot local R^2
localR2<-mod_gwr$SDF$localR2
nc$gwr_localR2<-localR2
spplot(nc,"gwr_localR2", at=quantile(nc$gwr_localR2), col.regions=heat.colors(10), main="GWR local R2")

# Residuals of GWR to see if there is residual spatial autocorrelation

gwr_res <- mod_gwr$SDF$gwr.e
nc$gwr_res<-gwr_res

spplot(nc,"gwr_res", at=quantile(nc$gwr_res), col.regions=heat.colors(10), main="GWR Residuals")

# Test residual autocorrelation with Moran's
moran.test(nc$gwr_res, listw=sids_mat_w)
## 
##  Moran I test under randomisation
## 
## data:  nc$gwr_res  
## weights: sids_mat_w    
## 
## Moran I statistic standard deviate = 0.56711, p-value = 0.2853
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##        0.02660716       -0.01010101        0.00418982